library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
library(usmap)
library(lmtest)
library(pracma)
library(lmtest)
library(forecast)
library(vars)
library(rvest)
library(RSelenium)
library(seleniumPipes)
library(dLagM)
library(jsonlite)
library(rgdal)
library(esri2sf)
library(readr)
options(
tigris_class = "sf",
tigris_use_cache = TRUE
)
Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")
# Load the processing functions. This also loads the normalization functions.
source("safegraph_process_patterns_functions.R")
# SET your path here to the covid19analysis folder.
sg_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/'
sg_hourly_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/weekly-patterns/v2/hourly/'
Load SFC and Alameda case data.
# block groups
sf_blockgroups <-
block_groups("CA","San Francisco", cb=F, progress_bar=F) %>%
st_transform('+proj=longlat +datum=WGS84')
ac_blockgroups <-
block_groups("CA","Alameda", cb=F, progress_bar=F) %>%
st_transform('+proj=longlat +datum=WGS84')
sf_bgs <- sf_blockgroups %>% pull(GEOID)
ac_bgs <- ac_blockgroups %>% pull(GEOID)
# zip code areas
zctas_94 <-
zctas(cb = F, starts_with = "94") %>%
st_transform('+proj=longlat +datum=WGS84') %>%
dplyr::select(ZCTA5CE10, geometry)
zctas_95 <-
zctas(cb = F, starts_with = "95") %>%
st_transform('+proj=longlat +datum=WGS84') %>%
dplyr::select(ZCTA5CE10, geometry)
zctas_94_95 <- rbind(zctas_94, zctas_95)
# join with block groups
sf_bg_zctas <- sf_blockgroups %>%
st_centroid() %>%
st_join(zctas_94_95) %>%
dplyr::select(GEOID, ZCTA5CE10) %>%
rename(blockgroup = GEOID, zipcode = ZCTA5CE10)
ac_bg_zctas <- ac_blockgroups %>%
st_centroid() %>%
st_join(zctas_94_95) %>%
dplyr::select(GEOID, ZCTA5CE10) %>%
rename(blockgroup = GEOID, zipcode = ZCTA5CE10)
# get SF case data
sf_place_cases <-
read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv") %>%
filter(county == 'San Francisco')
# get population data for San Francisco
acs_vars = readRDS("/Users/simonespeizer/CEE 218Z/censusData2018_acs_acs5.rds")
# define a function for pulling census data
pullCensus <- function(variableToPull, county) {
regionString <- paste0("state:06+county:", county)
censusData <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = regionString,
vars = variableToPull
) %>%
mutate(blockgroup = paste0(state,county,tract,block_group)) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME"))
return(censusData)
}
sf_fips <- fips("CA", "San Francisco") %>% substr(3,5)
# getting population by zip code
sf_pop_zip <- pullCensus("B01003_001E", sf_fips) %>%
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_pop_zip = sum(B01003_001E))
# join with cases and calculate cases per person
sf_cases_zip <- sf_place_cases %>% left_join(sf_pop_zip, by = c("place" = "zipcode")) %>%
mutate(cases_by_pop = confirmed_cases / total_pop_zip) %>%
rename(zipcode = place)
# get Alameda County case data - downloaded manually
ac_place_cases <- read.csv("Simone_Speizer/Alameda_County_Cumulative_Cases_By_Place_And_Zip.csv")
# handle the ones that are reported as <10 by calling them 5 cases. Note this is a simplification/choice I made and could be changed.
ac_cases_zip <- ac_place_cases %>%
rename(date = DtCreate) %>%
mutate(date = date %>% substr(1,10) %>% as.Date()) %>%
dplyr::select(c(date, contains("F9"))) %>% # only select zip code data
gather(key = "zipcode", value = "cases", -date) %>%
mutate(cases = ifelse(cases == "<10", "5", cases),
zipcode = zipcode %>% substr(2,6)) %>% # replace cases <10 with 10 and remove the "F" from zipcode names
mutate(cases = as.numeric(cases))
# get Alameda County populations by zip code
ac_fips <- fips("CA", "Alameda") %>% substr(3,5)
# calculate population by zip code
ac_pop_zip <- pullCensus("B01003_001E", ac_fips) %>%
left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_pop_zip = sum(B01003_001E))
# join with cases
ac_cases_zip <- ac_cases_zip %>% left_join(ac_pop_zip) %>%
mutate(cases_by_pop = cases / total_pop_zip)
# plot Alameda County case data
ac_cases_zip %>% plot_ly() %>%
add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
layout(title = "Alameda County")
# also plot SF for comparison
sf_cases_zip %>% plot_ly() %>%
add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
layout(title = "San Francisco County")
# plot Alameda and SF together
combined_cases_zip <- rbind(cbind(ac_cases_zip, county_name = "Alameda") %>% dplyr::select(date, cases_by_pop, zipcode, county_name), cbind(sf_cases_zip, county_name = "San Francisco") %>% dplyr::select(date, cases_by_pop, zipcode, county_name))
combined_cases_zip %>% plot_ly() %>%
add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'lines', color = ~zipcode, linetype = ~county_name) %>%
layout(title = "San Francisco and Alameda Counties")
Map the cases by zip code for the two counties.
curr_cases_sf_ac <- combined_cases_zip %>% filter((date == max(sf_cases_zip$date) & county_name == "San Francisco") | (date == max(ac_cases_zip$date) & county_name == "Alameda"))
pal_1 <-
colorNumeric(
palette = "Blues",
domain =
c(0,curr_cases_sf_ac %>%
pull(cases_by_pop) %>%
unique())
)
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(
data = curr_cases_sf_ac %>% left_join(zctas_94_95, by = c("zipcode" = "ZCTA5CE10")) %>% st_as_sf() %>% st_transform(4326),
fill = TRUE,
fillColor = ~pal_1(cases_by_pop),
color = "white",
weight = 0.5,
opacity = 0.5,
fillOpacity = 0.5,
group = "Cases per person",
label = curr_cases_sf_ac$cases_by_pop*1000 %>% round(4)
) %>%
addLayersControl(
overlayGroups = c("Cases per 1000 people")
)
Note that now zip codes are also listed in the labels that you see when you hover over a point.
visits_start_date <- as.Date("2020-04-16")
visits_end_date <- as.Date("2020-05-24")
cases_min_date <- as.Date("2020-04-21")
# SF first
# get initial cases
sf_init_cases <- sf_cases_zip %>% filter(date == cases_min_date) %>%
dplyr::select(zipcode, cases_by_pop) %>%
rename(init_cases_by_pop = cases_by_pop) %>%
mutate(log_init_cases_by_pop = log(init_cases_by_pop))
# summarize and add current and initial cases
sf_sd_visits_cases_zip_change <- sf_sd_visits_cases_zip_by_date %>%
filter(date >= visits_start_date & date <= visits_end_date) %>% # limit date range
filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
group_by(zipcode) %>%
summarize(total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low)) %>%
left_join(sf_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
visits_per_capita = total_visits_avg / total_pop_zip) %>%
filter(!is.na(cases_by_pop)) %>%
left_join(sf_init_cases) %>%
mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
change_cases_by_pop = cases_by_pop - init_cases_by_pop)
# Alameda
# get initial cases
ac_init_cases <- ac_cases_zip %>% filter(date == cases_min_date) %>%
dplyr::select(zipcode, cases_by_pop) %>%
rename(init_cases_by_pop = cases_by_pop) %>%
mutate(log_init_cases_by_pop = log(init_cases_by_pop))
# summarize and add current and initial cases
ac_sd_visits_cases_zip_change <- ac_sd_visits_cases_zip_by_date %>%
filter(date >= visits_start_date & date <= visits_end_date) %>%
filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
group_by(zipcode) %>%
summarize(total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low)) %>%
left_join(ac_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
visits_per_capita = total_visits_avg / total_pop_zip) %>%
filter(!is.na(cases_by_pop)) %>%
left_join(ac_init_cases) %>%
mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
change_cases_by_pop = cases_by_pop - init_cases_by_pop)
# plot
# San Francisco
sf_sd_visits_cases_zip_change %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in cases per person from 4/21-present'), title = "San Francisco visits/person and change in cases/person")
model_sf_visits_cases_change <- lm(change_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)
summary(model_sf_visits_cases_change)
##
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = sf_sd_visits_cases_zip_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0010835 -0.0007015 -0.0003363 0.0004772 0.0018545
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.918e-04 7.176e-04 -0.407 0.6880
## visits_per_capita 6.773e-05 3.810e-05 1.778 0.0887 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0008994 on 23 degrees of freedom
## Multiple R-squared: 0.1208, Adjusted R-squared: 0.08256
## F-statistic: 3.16 on 1 and 23 DF, p-value: 0.0887
sf_sd_visits_cases_zip_change %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "San Francisco visits/person and change in cases/person")
model_sf_visits_cases_change_log <- lm(change_log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)
summary(model_sf_visits_cases_change_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = sf_sd_visits_cases_zip_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48337 -0.21345 -0.06558 0.10117 1.98881
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.429974 0.387456 1.110 0.279
## visits_per_capita 0.005519 0.020573 0.268 0.791
##
## Residual standard error: 0.4856 on 23 degrees of freedom
## Multiple R-squared: 0.00312, Adjusted R-squared: -0.04022
## F-statistic: 0.07198 on 1 and 23 DF, p-value: 0.7909
What’s up with that outlier in the change in log of cases plot? It’s zip code 94108. Let’s look at their values.
print(sf_sd_visits_cases_zip_change %>% filter(zipcode == "94108") %>% dplyr::select(confirmed_cases, cases_by_pop, total_pop_zip, init_cases_by_pop))
## # A tibble: 1 x 4
## confirmed_cases cases_by_pop total_pop_zip init_cases_by_pop
## <dbl> <dbl> <dbl> <dbl>
## 1 12 0.000693 17320 0.0000577
It looks like they just started from a really low case number so the case growth just looks very exponential. Actually, were they the minimum initial case number?
print(sf_sd_visits_cases_zip_change$init_cases_by_pop)
## [1] 1.742619e-03 2.729537e-03 1.179245e-03 1.316284e-03 3.637320e-03
## [6] 5.773672e-05 1.045576e-03 2.306105e-03 2.839296e-04 1.557491e-03
## [11] 8.255523e-04 2.259215e-03 4.707887e-04 8.254230e-04 6.137121e-04
## [16] 7.543832e-04 5.720637e-04 1.066710e-03 2.782149e-03 1.001581e-03
## [21] 1.129495e-03 6.316169e-04 6.213223e-04 1.629767e-03 1.140344e-03
Yep, they were, and pretty significantly lower than all the others too. May be reasonable to exclude them then. Try plotting again without them in the log plot.
sf_sd_visits_cases_zip_change_edit <- sf_sd_visits_cases_zip_change %>% filter(zipcode != "94108")
sf_sd_visits_cases_zip_change_edit %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change_edit)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "San Francisco visits/person and change in cases/person, outlier removed")
model_sf_visits_cases_change_log <- lm(change_log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change_edit)
summary(model_sf_visits_cases_change_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = sf_sd_visits_cases_zip_change_edit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3447 -0.1535 -0.0036 0.1294 0.4829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.116711 0.176917 -0.660 0.51630
## visits_per_capita 0.030601 0.009279 3.298 0.00328 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2109 on 22 degrees of freedom
## Multiple R-squared: 0.3308, Adjusted R-squared: 0.3004
## F-statistic: 10.87 on 1 and 22 DF, p-value: 0.00328
Now that is more significant!
Alameda:
ac_sd_visits_cases_zip_change %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in cases per person from 4/21-present'), title = "Alameda visits/person and change in cases/person")
model_ac_visits_cases_change <- lm(change_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)
summary(model_ac_visits_cases_change)
##
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_sd_visits_cases_zip_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0013365 -0.0006413 -0.0002422 0.0002693 0.0047075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.385e-04 7.135e-04 -1.315 0.19517
## visits_per_capita 8.124e-05 2.900e-05 2.801 0.00754 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001097 on 44 degrees of freedom
## Multiple R-squared: 0.1513, Adjusted R-squared: 0.1321
## F-statistic: 7.847 on 1 and 44 DF, p-value: 0.007538
ac_sd_visits_cases_zip_change %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/23'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "Alameda visits/person and change in cases/person")
model_ac_visits_cases_change_log <- lm(change_log_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)
summary(model_ac_visits_cases_change_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = ac_sd_visits_cases_zip_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.56302 -0.26967 -0.06993 0.25275 1.13273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.15033 0.26810 0.561 0.578
## visits_per_capita 0.02414 0.01090 2.215 0.032 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4123 on 44 degrees of freedom
## Multiple R-squared: 0.1003, Adjusted R-squared: 0.07986
## F-statistic: 4.906 on 1 and 44 DF, p-value: 0.03199
Note that the R squared went down from my last version of this–partly because I included a couple university zip codes that I left out last time, and partly since some case counts went up. I also changed the starting date and ending date on visits as well, slightly.
It’s worth checking on the initial case levels for these and excluding any extreme outliers (i.e. that start very low), since we did that for SF.
print(ac_sd_visits_cases_zip_change$init_cases_by_pop)
## [1] 0.0003675883 0.0002993653 0.0003506501 0.0004110937 0.0004635788
## [6] 0.0013582395 0.0014801916 0.0018417789 0.0013664106 0.0013429540
## [11] 0.0003946174 0.0002242926 0.0002688606 0.0003583933 0.0009958409
## [16] 0.0007522190 0.0011886801 0.0009560991 0.0012949260 0.0007977101
## [21] 0.0009038381 0.0007933360 0.0004790929 0.0010115662 0.0004745634
## [26] 0.0011293545 0.0009484374 0.0006120802 0.0005821145 0.0004229403
## [31] 0.0002848841 0.0003949447 0.0007014028 0.0006825939 0.0003691944
## [36] 0.0029313406 0.0015452346 0.0002951594 0.0004711526 0.0001635323
## [41] 0.0004013163 0.0002999580 0.0005510856 0.0005215396 0.0004167709
## [46] 0.0006386512
None of those look unreasonable, actually.
Maybe we should also identify the university zip codes?
ac_sd_visits_cases_zip_change_univ_mark <- ac_sd_visits_cases_zip_change %>%
mutate(univ = zipcode %in% univ_zips)
ac_sd_visits_cases_zip_change_univ_mark %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~univ) %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change_univ_mark)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/23'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "Alameda visits/person and change in cases/person, univ zips marked")
Worth looking at.
Plot both on the same graph. Note that I excluded the SF outlier.
ac_sf_sd_visits_cases_zip_change <- rbind(ac_sd_visits_cases_zip_change %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'Alameda'), sf_sd_visits_cases_zip_change_edit %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'San Francisco'))
ac_sf_sd_visits_cases_zip_change %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~county) %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "Visits/person and change in cases/person, SF and Alameda")
model_combined_visits_cases_change_log <- lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change)
summary(model_combined_visits_cases_change_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = ac_sf_sd_visits_cases_zip_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5443 -0.2429 -0.0679 0.1839 1.1529
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.046845 0.166364 -0.282 0.779
## visits_per_capita 0.030776 0.007279 4.228 7.21e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.358 on 68 degrees of freedom
## Multiple R-squared: 0.2081, Adjusted R-squared: 0.1965
## F-statistic: 17.87 on 1 and 68 DF, p-value: 7.207e-05
Lower R squared than SF on its own, higher than Alameda on its own, but p value more significant than either on their own.
Also do not log just to compare.
ac_sf_sd_visits_cases_zip_change %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~county) %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/16-5/24'), yaxis = list(title = 'change in cases per person from 4/21-present'), title = "Visits/person and change in cases/person, SF and Alameda")
model_combined_visits_cases_change <- lm(change_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change)
summary(model_combined_visits_cases_change)
##
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_sf_sd_visits_cases_zip_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0013328 -0.0006430 -0.0002723 0.0002319 0.0046490
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.387e-04 4.827e-04 -0.909 0.36671
## visits_per_capita 6.469e-05 2.112e-05 3.063 0.00314 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001039 on 68 degrees of freedom
## Multiple R-squared: 0.1212, Adjusted R-squared: 0.1083
## F-statistic: 9.38 on 1 and 68 DF, p-value: 0.003141
Try looking at change in cases from 4/21 to 5/12, and 5/12 to present, and visits from 4/16 through 5/6 and 5/7 through 5/24.
visits_start_date_1 <- as.Date("2020-04-16")
visits_end_date_1 <- as.Date("2020-05-06")
cases_end_date_1 <- as.Date("2020-05-12")
visits_end_date_2 <- as.Date("2020-05-24")
# SF first
# first initial cases are the same as last time
sf_init_cases_1 <- sf_init_cases
# second initial cases start the end date of first segment
sf_init_cases_2 <- sf_cases_zip %>% filter(date == cases_end_date_1) %>%
dplyr::select(zipcode, cases_by_pop) %>%
rename(init_cases_by_pop = cases_by_pop) %>%
mutate(log_init_cases_by_pop = log(init_cases_by_pop))
# summarize and add current and initial cases, first segment
sf_sd_visits_cases_zip_change_1 <- sf_sd_visits_cases_zip_by_date %>%
filter(date >= visits_start_date_1 & date <= visits_end_date_1) %>%
filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
group_by(zipcode) %>%
summarize(total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low)) %>%
left_join(sf_cases_zip %>% filter(date == cases_end_date_1) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
visits_per_capita = total_visits_avg / total_pop_zip) %>%
filter(!is.na(cases_by_pop)) %>%
left_join(sf_init_cases_1) %>%
mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
change_cases_by_pop = cases_by_pop - init_cases_by_pop)
# summarize and add current and initial cases, second segment
sf_sd_visits_cases_zip_change_2 <- sf_sd_visits_cases_zip_by_date %>%
filter(date > visits_end_date_1 & date <= visits_end_date_2) %>%
filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
group_by(zipcode) %>%
summarize(total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low)) %>%
left_join(sf_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
visits_per_capita = total_visits_avg / total_pop_zip) %>%
filter(!is.na(cases_by_pop)) %>%
left_join(sf_init_cases_2) %>%
mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
change_cases_by_pop = cases_by_pop - init_cases_by_pop)
# Alameda
# first initial cases are the same as last time
ac_init_cases_1 <- ac_init_cases
# second initial cases start the end date of first segment
ac_init_cases_2 <- ac_cases_zip %>% filter(date == cases_end_date_1) %>%
dplyr::select(zipcode, cases_by_pop) %>%
rename(init_cases_by_pop = cases_by_pop) %>%
mutate(log_init_cases_by_pop = log(init_cases_by_pop))
# summarize and add current and initial cases, first segment
ac_sd_visits_cases_zip_change_1 <- ac_sd_visits_cases_zip_by_date %>%
filter(date >= visits_start_date_1 & date <= visits_end_date_1) %>%
filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
group_by(zipcode) %>%
summarize(total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low)) %>%
left_join(ac_cases_zip %>% filter(date == cases_end_date_1) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
visits_per_capita = total_visits_avg / total_pop_zip) %>%
filter(!is.na(cases_by_pop)) %>%
left_join(ac_init_cases_1) %>%
mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
change_cases_by_pop = cases_by_pop - init_cases_by_pop)
# summarize and add current and initial cases, second segment
ac_sd_visits_cases_zip_change_2 <- ac_sd_visits_cases_zip_by_date %>%
filter(date > visits_end_date_1 & date <= visits_end_date_2) %>%
filter(!is.na(zipcode) & !is.na(total_visits_high)) %>%
group_by(zipcode) %>%
summarize(total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low)) %>%
left_join(ac_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
visits_per_capita = total_visits_avg / total_pop_zip) %>%
filter(!is.na(cases_by_pop)) %>%
left_join(ac_init_cases_2) %>%
mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
change_cases_by_pop = cases_by_pop - init_cases_by_pop)
# combine
ac_sf_sd_visits_cases_zip_change_2_chunks <- rbind(ac_sd_visits_cases_zip_change_1 %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'Alameda', time_chunk = 1), ac_sd_visits_cases_zip_change_2 %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'Alameda', time_chunk = 2), sf_sd_visits_cases_zip_change_1 %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'San Francisco', time_chunk = 1), sf_sd_visits_cases_zip_change_2 %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita, change_cases_by_pop) %>% cbind(county = 'San Francisco', time_chunk = 2))
ac_sf_sd_visits_cases_zip_change_2_chunks %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~county, symbol = ~time_chunk) %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change_2_chunks)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person'), yaxis = list(title = 'change in log(cases per person)'), title = "Visits/person and change in cases/person, SF and Alameda, 2 time chunks")
model_combined_visits_cases_change_log_2_chunks <- lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change_2_chunks)
summary(model_combined_visits_cases_change_log_2_chunks)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = ac_sf_sd_visits_cases_zip_change_2_chunks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41012 -0.19040 -0.08574 0.11898 2.28188
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.064736 0.091765 0.705 0.48170
## visits_per_capita 0.024129 0.008038 3.002 0.00318 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3021 on 140 degrees of freedom
## Multiple R-squared: 0.06047, Adjusted R-squared: 0.05376
## F-statistic: 9.01 on 1 and 140 DF, p-value: 0.00318
Remove that one SF zipcode again.
ac_sf_sd_visits_cases_zip_change_2_chunks_edit <- ac_sf_sd_visits_cases_zip_change_2_chunks %>% filter(zipcode != "94108")
ac_sf_sd_visits_cases_zip_change_2_chunks_edit %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~county, symbol = ~time_chunk) %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change_2_chunks_edit)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person'), yaxis = list(title = 'change in log(cases per person)'), title = "Visits/person and change in cases/person, SF and Alameda, 2 time chunks")
model_combined_visits_cases_change_log_2_chunks_2 <- lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change_2_chunks_edit)
summary(model_combined_visits_cases_change_log_2_chunks_2)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = ac_sf_sd_visits_cases_zip_change_2_chunks_edit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42199 -0.16292 -0.07314 0.13109 0.69572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.039915 0.072014 -0.554 0.58
## visits_per_capita 0.032269 0.006276 5.142 9.14e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2316 on 138 degrees of freedom
## Multiple R-squared: 0.1608, Adjusted R-squared: 0.1547
## F-statistic: 26.44 on 1 and 138 DF, p-value: 9.136e-07
Interesting, this doesn’t quite fit as well. But still significant. Might have to do with less time for cases to grow overall? A lot don’t have changes in this graph, which could be altering the results.
Looking at average household size and case data. Note that this is a weighted mean of the average household size.
# block group populations
sf_pop_bg <- pullCensus("B01003_001E", sf_fips) %>%
rename(total_pop = B01003_001E)
ac_pop_bg <- pullCensus("B01003_001E", ac_fips) %>%
rename(total_pop = B01003_001E)
sf_avg_household_size <- pullCensus("B25010_001E", sf_fips) %>%
filter(B25010_001E > 0) %>% # some had negative values
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
left_join(sf_pop_bg) %>%
group_by(zipcode) %>%
summarize(avg_household_size = weighted.mean(B25010_001E, total_pop)) %>%
filter(!is.na(zipcode))
ac_avg_household_size <- pullCensus("B25010_001E", ac_fips) %>%
filter(B25010_001E > 0) %>%
left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
left_join(ac_pop_bg) %>%
group_by(zipcode) %>%
summarize(avg_household_size = weighted.mean(B25010_001E, total_pop)) %>%
filter(!is.na(zipcode))
# correlate with case data - current cases
sf_cases_house_size_zip <- left_join(sf_cases_zip %>% filter(date == max(date)), sf_avg_household_size) %>% filter(!is.na(avg_household_size))
sf_cases_house_size_zip %>% plot_ly() %>%
add_trace(x = ~avg_household_size, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~avg_household_size, y = fitted(lm(cases_by_pop ~ avg_household_size, sf_cases_house_size_zip)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Population weighted avg household size'), yaxis = list(title = 'Current cases per person'), title = "San Francisco County")
sf_model_household_size_cases <- lm(cases_by_pop ~ avg_household_size, sf_cases_house_size_zip)
summary(sf_model_household_size_cases)
##
## Call:
## lm(formula = cases_by_pop ~ avg_household_size, data = sf_cases_house_size_zip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0018051 -0.0012945 -0.0007524 0.0005495 0.0033981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0009224 0.0014445 0.639 0.529
## avg_household_size 0.0005546 0.0005950 0.932 0.361
##
## Residual standard error: 0.001716 on 23 degrees of freedom
## Multiple R-squared: 0.0364, Adjusted R-squared: -0.005501
## F-statistic: 0.8687 on 1 and 23 DF, p-value: 0.361
# alameda
ac_cases_house_size_zip <- left_join(ac_cases_zip %>% filter(date == max(date)), ac_avg_household_size) %>% filter(!is.na(avg_household_size))
ac_cases_house_size_zip %>% plot_ly() %>%
add_trace(x = ~avg_household_size, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~avg_household_size, y = fitted(lm(cases_by_pop ~ avg_household_size, ac_cases_house_size_zip)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Population weighted avg household size'), yaxis = list(title = 'Current cases per person'), title = "Alameda County")
ac_model_household_size_cases <- lm(cases_by_pop ~ avg_household_size, ac_cases_house_size_zip)
summary(ac_model_household_size_cases)
##
## Call:
## lm(formula = cases_by_pop ~ avg_household_size, data = ac_cases_house_size_zip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0019174 -0.0009146 -0.0001070 0.0007149 0.0042124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0029958 0.0012537 -2.390 0.021219 *
## avg_household_size 0.0016881 0.0004395 3.841 0.000389 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001346 on 44 degrees of freedom
## Multiple R-squared: 0.2511, Adjusted R-squared: 0.2341
## F-statistic: 14.75 on 1 and 44 DF, p-value: 0.0003895
Interesting–a significant relationship in Alameda but not San Francisco. This adds to the story that is seeming to build that SF might be different from other places.
Just to compare, because I’m now curious, going to use the data from SCC too.
# block groups
scc_blockgroups <-
block_groups("CA","Santa Clara", cb=F, progress_bar=F) %>%
st_transform('+proj=longlat +datum=WGS84')
scc_bgs <- scc_blockgroups %>% pull(GEOID)
# join with zip codes
scc_bg_zctas <- scc_blockgroups %>%
st_centroid() %>%
st_join(zctas_94_95) %>%
dplyr::select(GEOID, ZCTA5CE10) %>%
rename(blockgroup = GEOID, zipcode = ZCTA5CE10)
# get case data
scc_map_cases <- esri2sf("https://services2.arcgis.com/RiZWfy7B1r76pKTz/arcgis/rest/services/COVID_zip_FC/FeatureServer/0")
## [1] "Feature Layer"
## [1] "esriGeometryPolygon"
scc_cases_zip <- scc_map_cases %>%
dplyr::select(zipcode, Cases, Population) %>%
st_drop_geometry() %>%
rename(cases = Cases, pop = Population) %>%
mutate(cases = replace_na(cases, 0)) %>%
mutate(cases_by_pop = cases/pop) %>%
filter(is.numeric(cases_by_pop))
# get population data
scc_fips <- fips("CA", "Santa Clara") %>% substr(3,5)
scc_pop_zip <- pullCensus("B01003_001E", scc_fips) %>%
left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_pop_zip = sum(B01003_001E))
scc_pop_bg <- pullCensus("B01003_001E", scc_fips) %>%
rename(total_pop = B01003_001E)
scc_avg_household_size <- pullCensus("B25010_001E", scc_fips) %>%
filter(B25010_001E > 0) %>%
left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
left_join(scc_pop_bg) %>%
group_by(zipcode) %>%
summarize(avg_household_size = weighted.mean(B25010_001E, total_pop)) %>%
filter(!is.na(zipcode))
# correlate with case data - current cases
scc_cases_house_size_zip <- left_join(scc_cases_zip, scc_avg_household_size) %>% filter(!is.na(avg_household_size))
scc_cases_house_size_zip %>% plot_ly() %>%
add_trace(x = ~avg_household_size, y = ~cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~avg_household_size, y = fitted(lm(cases_by_pop ~ avg_household_size, scc_cases_house_size_zip)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Population weighted avg household size'), yaxis = list(title = 'Current cases per person'), title = "Santa Clara County")
scc_model_household_size_cases <- lm(cases_by_pop ~ avg_household_size, scc_cases_house_size_zip)
summary(scc_model_household_size_cases)
##
## Call:
## lm(formula = cases_by_pop ~ avg_household_size, data = scc_cases_house_size_zip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0017069 -0.0004330 -0.0000825 0.0001695 0.0038324
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0007294 0.0006548 -1.114 0.27031
## avg_household_size 0.0006462 0.0002144 3.015 0.00394 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.000879 on 53 degrees of freedom
## Multiple R-squared: 0.1464, Adjusted R-squared: 0.1303
## F-statistic: 9.088 on 1 and 53 DF, p-value: 0.003943
Again more of what we would expect. SF does seem to be odd.
First get the areas of the POIs.
poi_areas <- read_csv('/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/geo-supplement/May2020Release/SafeGraphPlacesGeoSupplementSquareFeet.csv.gz')
poi_areas_selected <- poi_areas %>% dplyr::select(safegraph_place_id, area_square_feet)
I need hourly visits for each day to do this. That unfortunately takes me quite a while to run, so we’ll see how this goes. Going to limit it to 2 weeks at a time for now, starting with just the ones relevant for the previous analyses I was doing.
# files_to_process <- list.files(paste0(sg_hourly_path, "ByDay/"), pattern = "day")
#
# files_to_process_selected <- files_to_process[64:77] # 5/11-5/24
#
# bay_hourly_visits_places <- NULL
# ac_hourly_visits_zip <- NULL
# sf_hourly_visits_zip <- NULL
# for (i in 1:length(files_to_process_selected)) {
# curr_bay_hourly_origins <- readRDS(paste0(sg_hourly_path, "ByDay/", files_to_process_selected[i])) %>% dplyr::select(safegraph_place_id, date, hour, visit_counts_hourly_high, visit_counts_hourly_low, origin_census_block_group, pop) %>%
# filter(!is.na(pop))
#
# # get hourly visits to each safegraph place
# curr_bay_hourly_visits_places <- curr_bay_hourly_origins %>%
# group_by(safegraph_place_id, hour, date) %>%
# summarize(total_visits_hourly_high = sum(visit_counts_hourly_high), total_visits_hourly_low = sum(visit_counts_hourly_low)) %>%
# mutate(total_visits_hourly_avg = (total_visits_hourly_high + total_visits_hourly_low) / 2) %>%
# ungroup()
#
# # get hourly visits by each zip code to each place in Alameda
# curr_ac_hourly_visits_zip <- curr_bay_hourly_origins %>% filter(origin_census_block_group %in% ac_bgs) %>%
# left_join(ac_bg_zctas %>% as.data.frame() %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
# group_by(zipcode, date, hour, safegraph_place_id) %>%
# summarize(total_visits_high = sum(visit_counts_hourly_high), total_visits_low = sum(visit_counts_hourly_low)) %>%
# mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)
#
# # get hourly visits by each zip code to each place in San Francisco
# curr_sf_hourly_visits_zip <- curr_bay_hourly_origins %>% filter(origin_census_block_group %in% sf_bgs) %>%
# left_join(sf_bg_zctas %>% as.data.frame() %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
# group_by(zipcode, date, hour, safegraph_place_id) %>%
# summarize(total_visits_high = sum(visit_counts_hourly_high), total_visits_low = sum(visit_counts_hourly_low)) %>%
# mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)
#
# # combine
# bay_hourly_visits_places <- rbind(bay_hourly_visits_places, curr_bay_hourly_visits_places)
# ac_hourly_visits_zip <- rbind(ac_hourly_visits_zip, curr_ac_hourly_visits_zip)
# sf_hourly_visits_zip <- rbind(sf_hourly_visits_zip, curr_sf_hourly_visits_zip)
# }
#
# # save the rds files
# saveRDS(bay_hourly_visits_places, paste0(sg_hourly_path, "bay_hourly_visits_places_5-11_5-24.rds"))
# saveRDS(ac_hourly_visits_zip, paste0(sg_hourly_path, "ac_hourly_visits_zip_5-11_5-24.rds"))
# saveRDS(sf_hourly_visits_zip, paste0(sg_hourly_path, "sf_hourly_visits_zip_5-11_5-24.rds"))
# # join the visits to places files with the areas and calculate visit/area ratios
# bay_hourly_visits_places_413_426 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_4-13_4-26.rds"))
# bay_hourly_visits_places_427_510 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_4-27_5-10.rds"))
# bay_hourly_visits_places_511_524 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_5-11_5-24.rds"))
#
# # left join with areas
# bay_hourly_visits_places_413_426 <- bay_hourly_visits_places_413_426 %>% left_join(poi_areas_selected) %>%
# mutate(hourly_visits_per_area = total_visits_hourly_avg / area_square_feet)
# bay_hourly_visits_places_427_510 <- bay_hourly_visits_places_427_510 %>% left_join(poi_areas_selected) %>%
# mutate(hourly_visits_per_area = total_visits_hourly_avg / area_square_feet)
# bay_hourly_visits_places_511_524 <- bay_hourly_visits_places_511_524 %>% left_join(poi_areas_selected) %>%
# mutate(hourly_visits_per_area = total_visits_hourly_avg / area_square_feet)
#
# # re-save the RDS files
# saveRDS(bay_hourly_visits_places_413_426, paste0(sg_hourly_path, "bay_hourly_visits_places_4-13_4-26.rds"))
# saveRDS(bay_hourly_visits_places_427_510, paste0(sg_hourly_path, "bay_hourly_visits_places_4-27_5-10.rds"))
# saveRDS(bay_hourly_visits_places_511_524, paste0(sg_hourly_path, "bay_hourly_visits_places_5-11_5-24.rds"))
# load the files now that they have been processed
# load places and weights
bay_hourly_visits_places_413_426 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_4-13_4-26.rds"))
bay_hourly_visits_places_427_510 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_4-27_5-10.rds"))
bay_hourly_visits_places_511_524 <- readRDS(paste0(sg_hourly_path, "bay_hourly_visits_places_5-11_5-24.rds"))
# load visits by origin
ac_hourly_visits_zip_413_426 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_4-13_4-26.rds"))
ac_hourly_visits_zip_427_510 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_4-27_5-10.rds"))
ac_hourly_visits_zip_511_524 <- readRDS(paste0(sg_hourly_path, "ac_hourly_visits_zip_5-11_5-24.rds"))
sf_hourly_visits_zip_413_426 <- readRDS(paste0(sg_hourly_path, "sf_hourly_visits_zip_4-13_4-26.rds"))
sf_hourly_visits_zip_427_510 <- readRDS(paste0(sg_hourly_path, "sf_hourly_visits_zip_4-27_5-10.rds"))
sf_hourly_visits_zip_511_524 <- readRDS(paste0(sg_hourly_path, "sf_hourly_visits_zip_5-11_5-24.rds"))
# now add the weights (visits per hour per area) to the visits data by origin
ac_hourly_visits_zip_413_426_weights <- left_join(ac_hourly_visits_zip_413_426, bay_hourly_visits_places_413_426 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>%
mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)
ac_hourly_visits_zip_427_510_weights <- left_join(ac_hourly_visits_zip_427_510, bay_hourly_visits_places_427_510 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>%
mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)
ac_hourly_visits_zip_511_524_weights <- left_join(ac_hourly_visits_zip_511_524, bay_hourly_visits_places_511_524 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>%
mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)
# SF
sf_hourly_visits_zip_413_426_weights <- left_join(sf_hourly_visits_zip_413_426, bay_hourly_visits_places_413_426 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>%
mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)
sf_hourly_visits_zip_427_510_weights <- left_join(sf_hourly_visits_zip_427_510, bay_hourly_visits_places_427_510 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>%
mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)
sf_hourly_visits_zip_511_524_weights <- left_join(sf_hourly_visits_zip_511_524, bay_hourly_visits_places_511_524 %>% dplyr::select(safegraph_place_id, hour, date, total_visits_hourly_avg, area_square_feet, hourly_visits_per_area) %>% rename(place_visits_per_area = hourly_visits_per_area, place_total_visits_avg = total_visits_hourly_avg)) %>%
mutate(weighted_visits_by_ratio = total_visits_avg * place_visits_per_area, weighted_visits_by_ratio_sqr = total_visits_avg * (place_visits_per_area)^2)
For clarity, here is a table exemplifying the data I am using. weighted_visits_by_ratio is the visits by that zip code multiplied by the number of total visits to that location over the location’s area, weighed_visits_by_ratio_sqr is the same except the (total visits to that location over the location’s area) is squared.
kable(ac_hourly_visits_zip_413_426_weights %>% dplyr::select(zipcode, date, hour, safegraph_place_id, total_visits_avg, place_total_visits_avg, area_square_feet, place_visits_per_area, weighted_visits_by_ratio, weighted_visits_by_ratio_sqr) %>%
.[1:50,])
| zipcode | date | hour | safegraph_place_id | total_visits_avg | place_total_visits_avg | area_square_feet | place_visits_per_area | weighted_visits_by_ratio | weighted_visits_by_ratio_sqr |
|---|---|---|---|---|---|---|---|---|---|
| 94501 | 2020-04-13 | 1 | sg:04c6d02363974bd38d1c2a4bee4c32a4 | 0.2919753 | 39.426037 | 1374160 | 0.0000287 | 0.0000084 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:078a04c8a7fb4656a543e665527ea159 | 1.3666995 | 29.431109 | 570196 | 0.0000516 | 0.0000705 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:0a2d5058e3714b7a8b4d953bf0adf62f | 1.2142213 | 36.756980 | 95093129 | 0.0000004 | 0.0000005 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:0fb0ca4cde7c4ef4b905340d1faeacfb | 0.3396178 | 36.774126 | 151676 | 0.0002425 | 0.0000823 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:111ab24c7047429585c803eb7ae99bdb | 0.1609400 | 32.990999 | 561521 | 0.0000588 | 0.0000095 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:1150eab79ba14eaea07dd966f27a5e85 | 36.9444864 | 36.944486 | 3450 | 0.0107085 | 0.3956218 | 0.0042365 |
| 94501 | 2020-04-13 | 1 | sg:1509cc41681349d0b7a622cb635aac5c | 1.1559252 | 34.330875 | 346612 | 0.0000990 | 0.0001145 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:1735062c1ee34df7bb39127cb912ce66 | 0.7336302 | 32.317342 | 114621 | 0.0002819 | 0.0002068 | 0.0000001 |
| 94501 | 2020-04-13 | 1 | sg:1941a94853df45aeb756f2026ca0ffe4 | 6.4515969 | 30.776152 | 310790 | 0.0000990 | 0.0006389 | 0.0000001 |
| 94501 | 2020-04-13 | 1 | sg:1eeea35698bf486d8cb08a371f3eb823 | 0.9040179 | 19.608160 | 12415 | 0.0015794 | 0.0014278 | 0.0000023 |
| 94501 | 2020-04-13 | 1 | sg:22a5672c0bfb4cae874ffcc90324b400 | 0.2930957 | 31.897638 | 457493 | 0.0000697 | 0.0000204 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:29193f4b97c64ec28437d2349bf40c70 | 1.4501053 | 36.556157 | 815391 | 0.0000448 | 0.0000650 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:32006d4bb5ea43038f1e7c969f64418e | 2.1093750 | 14.514719 | 3293 | 0.0044077 | 0.0092976 | 0.0000410 |
| 94501 | 2020-04-13 | 1 | sg:373642ecded84befa5298c040ef42945 | 0.7101977 | 46.284456 | 13615965 | 0.0000034 | 0.0000024 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:3929f806d3214533b703755ff589263f | 0.3637460 | 42.320334 | 172807 | 0.0002449 | 0.0000891 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:42829ee300c543f88dc7ed5b199dce40 | 9.7147059 | 53.968805 | 2028 | 0.0266118 | 0.2585262 | 0.0068799 |
| 94501 | 2020-04-13 | 1 | sg:42928ac18b3647158af774c1c3555ef6 | 6.4084084 | 21.955688 | 2851 | 0.0077010 | 0.0493515 | 0.0003801 |
| 94501 | 2020-04-13 | 1 | sg:4765d7f23a544d00b47da92ca70ee08b | 5.9206349 | 37.184924 | 63055 | 0.0005897 | 0.0034915 | 0.0000021 |
| 94501 | 2020-04-13 | 1 | sg:47be1a236aa446158486563d1b5af240 | 1.4603365 | 25.062989 | 25590 | 0.0009794 | 0.0014303 | 0.0000014 |
| 94501 | 2020-04-13 | 1 | sg:52f794242e8f45dfa843623708ca360e | 0.8445084 | 66.721845 | 703859 | 0.0000948 | 0.0000801 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:5d44cabbaf844c69ac94500fb5a157a6 | 17.5000000 | 17.500000 | 4049 | 0.0043221 | 0.0756360 | 0.0003269 |
| 94501 | 2020-04-13 | 1 | sg:66a3f924cbff40f890e94d3e0fe4048e | 5.3212670 | 12.940193 | 12426 | 0.0010414 | 0.0055415 | 0.0000058 |
| 94501 | 2020-04-13 | 1 | sg:68a4065d338e4ec1beda445a6625d371 | 24.2647059 | 33.962773 | 1351 | 0.0251390 | 0.6099902 | 0.0153345 |
| 94501 | 2020-04-13 | 1 | sg:6bed7ca0592c4cceab22a0e301b04580 | 2.6185345 | 13.474323 | 33938 | 0.0003970 | 0.0010396 | 0.0000004 |
| 94501 | 2020-04-13 | 1 | sg:734b5b7d28614c2baeb0ea7ddb24b527 | 4.7448815 | 31.299596 | 228746 | 0.0001368 | 0.0006492 | 0.0000001 |
| 94501 | 2020-04-13 | 1 | sg:767fabd815164a438afc6485ab527988 | 2.0813665 | 46.353935 | 55298 | 0.0008383 | 0.0017447 | 0.0000015 |
| 94501 | 2020-04-13 | 1 | sg:76d9ffcc840649cc96855014177a1bb5 | 3.5387755 | 14.029823 | 117220 | 0.0001197 | 0.0004235 | 0.0000001 |
| 94501 | 2020-04-13 | 1 | sg:7712a59c880547b1b1f224efdcfd70b0 | 0.3804538 | 29.175450 | 522829 | 0.0000558 | 0.0000212 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:82fe8855e6d8463fa00babdd3ed68459 | 0.7271142 | 23.125278 | 1005 | 0.0230102 | 0.0167311 | 0.0003850 |
| 94501 | 2020-04-13 | 1 | sg:8e589cc810534e4bbc77b93c57464c19 | 1.1520241 | 34.247811 | 943769 | 0.0000363 | 0.0000418 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:8fb6d7cba90447909e7999441edaa6d8 | 1.6071429 | 25.887155 | 129580 | 0.0001998 | 0.0003211 | 0.0000001 |
| 94501 | 2020-04-13 | 1 | sg:91d2eacfbfb049558c8cb490540182d4 | 7.3355482 | 28.378668 | 9886 | 0.0028706 | 0.0210574 | 0.0000604 |
| 94501 | 2020-04-13 | 1 | sg:9a368d0076254cfeb0f6aa71e327b060 | 1.3560268 | 6.085938 | 28822 | 0.0002112 | 0.0002863 | 0.0000001 |
| 94501 | 2020-04-13 | 1 | sg:9a396f32f2d748efb313c54ca31a90f2 | 2.4860491 | 34.736260 | 52092 | 0.0006668 | 0.0016578 | 0.0000011 |
| 94501 | 2020-04-13 | 1 | sg:aef80f68e47c4a5aa5b2ac569865f0c1 | 0.8728448 | 51.025657 | 24943 | 0.0020457 | 0.0017856 | 0.0000037 |
| 94501 | 2020-04-13 | 1 | sg:b55da1b654de4f7ebe896497feaef6b1 | 13.8468182 | 27.212177 | 12036 | 0.0022609 | 0.0313063 | 0.0000708 |
| 94501 | 2020-04-13 | 1 | sg:c39e5d0bbc5040fe9e93b8cc08610983 | 17.5325581 | 21.192558 | 4897 | 0.0043277 | 0.0758750 | 0.0003284 |
| 94501 | 2020-04-13 | 1 | sg:c4667ef299e945e081f0fba860078190 | 13.1452991 | 13.145299 | 8805185 | 0.0000015 | 0.0000196 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:c4a1ebeb9f714cf0abd43b1f9df10b1a | 38.5280000 | 154.466539 | 32373 | 0.0047715 | 0.1838349 | 0.0008772 |
| 94501 | 2020-04-13 | 1 | sg:cb9a47c37a804f329e55310b2657a83d | 1.9480620 | 11.351733 | 17528 | 0.0006476 | 0.0012616 | 0.0000008 |
| 94501 | 2020-04-13 | 1 | sg:d795296c3f834841a393f167f0c0ac54 | 3.1308140 | 19.879307 | 7729 | 0.0025720 | 0.0080526 | 0.0000207 |
| 94501 | 2020-04-13 | 1 | sg:d8f7933062f24dd0bc0e154c7d4ca084 | 1.2678725 | 18.227776 | 3340 | 0.0054574 | 0.0069193 | 0.0000378 |
| 94501 | 2020-04-13 | 1 | sg:dd29ebfcd56f4be58c99edf56ae0a322 | 38.6842105 | 38.684210 | 7725 | 0.0050077 | 0.1937176 | 0.0009701 |
| 94501 | 2020-04-13 | 1 | sg:ddfd196c3e6b44cea480a0d591b74c17 | 10.0426597 | 38.014404 | 1376 | 0.0276267 | 0.2774460 | 0.0076649 |
| 94501 | 2020-04-13 | 1 | sg:ee389fc5cf294b15bcdc5968a4962d40 | 0.5472127 | 33.266603 | 2375819 | 0.0000140 | 0.0000077 | 0.0000000 |
| 94501 | 2020-04-13 | 1 | sg:f1c51f98474e4e0889fba192e5d00a81 | 1.3372093 | 23.304180 | 114621 | 0.0002033 | 0.0002719 | 0.0000001 |
| 94501 | 2020-04-13 | 1 | sg:f235487e0c5e4cb19cf3bf9856393596 | 4.2613857 | 33.709096 | 4606 | 0.0073185 | 0.0311870 | 0.0002282 |
| 94501 | 2020-04-13 | 1 | sg:f460dad0a8694944b6401265e0708bdb | 19.2053372 | 34.872349 | 97778 | 0.0003566 | 0.0068495 | 0.0000024 |
| 94501 | 2020-04-13 | 2 | sg:01280c43a0e54530b6182b24b66fdff8 | 0.4576760 | 20.068828 | 44874 | 0.0004472 | 0.0002047 | 0.0000001 |
| 94501 | 2020-04-13 | 2 | sg:02ad023976754a07bdd0c95ebbc59d6b | 24.1659225 | 34.980391 | 14318 | 0.0024431 | 0.0590399 | 0.0001442 |
Summarize for each day and join into a single data frame for each county.
ac_daily_visits_zip_413_524_weights <- rbind(ac_hourly_visits_zip_413_426_weights %>%
group_by(zipcode, date) %>%
summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr)),
ac_hourly_visits_zip_427_510_weights %>%
group_by(zipcode, date) %>%
summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr)),
ac_hourly_visits_zip_511_524_weights %>%
group_by(zipcode, date) %>%
summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr))
)
sf_daily_visits_zip_413_524_weights <- rbind(sf_hourly_visits_zip_413_426_weights %>%
group_by(zipcode, date) %>%
summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr)),
sf_hourly_visits_zip_427_510_weights %>%
group_by(zipcode, date) %>%
summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr)),
sf_hourly_visits_zip_511_524_weights %>%
group_by(zipcode, date) %>%
summarize(total_weighted_visits_by_ratio = sum(weighted_visits_by_ratio), total_weighted_visits_by_ratio_sqr = sum(weighted_visits_by_ratio_sqr))
)
Can now compare to the previous visits data.
# calculate weighted visits per capita
sf_sd_visits_cases_zip_by_date_w_weighted <- left_join(sf_sd_visits_cases_zip_by_date, sf_daily_visits_zip_413_524_weights) %>%
mutate(weighted_visits_per_capita = total_weighted_visits_by_ratio / total_pop_zip, weighted_visits_ratio_sqr_per_capita = total_weighted_visits_by_ratio_sqr / total_pop_zip)
ac_sd_visits_cases_zip_by_date_w_weighted <- left_join(ac_sd_visits_cases_zip_by_date, ac_daily_visits_zip_413_524_weights) %>%
mutate(weighted_visits_per_capita = total_weighted_visits_by_ratio / total_pop_zip, weighted_visits_ratio_sqr_per_capita = total_weighted_visits_by_ratio_sqr / total_pop_zip)
sf_sd_visits_cases_zip_by_date_w_weighted %>% plot_ly() %>%
add_trace(x = ~weighted_visits_per_capita, y = ~visits_per_capita, type = 'scatter', mode = 'markers') %>%
layout(title = "San Francisco County")
ac_sd_visits_cases_zip_by_date_w_weighted %>% plot_ly() %>%
add_trace(x = ~weighted_visits_per_capita, y = ~visits_per_capita, type = 'scatter', mode = 'markers') %>%
layout(title = "Alameda County")
Now look at with cases. Again doing change in cases 4/21-present and visits since 4/16 through 5/24.
# SF first
# summarize and add current and initial cases
sf_sd_visits_cases_zip_change_w_weighted <- sf_sd_visits_cases_zip_by_date_w_weighted %>%
filter(date >= visits_start_date & date <= visits_end_date) %>% # limit date
filter(!is.na(zipcode) & !is.na(total_visits_high) & !is.na(total_weighted_visits_by_ratio)) %>%
group_by(zipcode) %>%
# summarize visits data in this time range
summarize(total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low),
total_weighted_visits = sum(total_weighted_visits_by_ratio),
total_weighted_visits_by_ratio_sqr = sum(total_weighted_visits_by_ratio_sqr)) %>%
# join with current case levels
left_join(sf_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
# calculate visits per capita
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
visits_per_capita = total_visits_avg / total_pop_zip,
weighted_visits_per_capita = total_weighted_visits / total_pop_zip,
weighted_visits_per_capita_ratio_sqr = total_weighted_visits_by_ratio_sqr / total_pop_zip) %>%
filter(!is.na(cases_by_pop)) %>%
# join with initial case levels
left_join(sf_init_cases) %>%
# get change in cases
mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
change_cases_by_pop = cases_by_pop - init_cases_by_pop)
# Alameda
# get initial cases
ac_sd_visits_cases_zip_change_w_weighted <- ac_sd_visits_cases_zip_by_date_w_weighted %>%
filter(date >= visits_start_date & date <= visits_end_date) %>%
filter(!is.na(zipcode) & !is.na(total_visits_high) & !is.na(total_weighted_visits_by_ratio)) %>%
group_by(zipcode) %>%
summarize(total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low),
total_weighted_visits = sum(total_weighted_visits_by_ratio),
total_weighted_visits_by_ratio_sqr = sum(total_weighted_visits_by_ratio_sqr)) %>%
left_join(ac_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
visits_per_capita = total_visits_avg / total_pop_zip,
weighted_visits_per_capita = total_weighted_visits / total_pop_zip,
weighted_visits_per_capita_ratio_sqr = total_weighted_visits_by_ratio_sqr / total_pop_zip) %>%
filter(!is.na(cases_by_pop)) %>%
left_join(ac_init_cases) %>%
mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
change_cases_by_pop = cases_by_pop - init_cases_by_pop)
# plot
# San Francisco
sf_sd_visits_cases_zip_change_w_weighted %>%
plot_ly() %>%
add_trace(x = ~weighted_visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~weighted_visits_per_capita, y = fitted(lm(change_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total weighted (with simple ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in cases per person from 4/21-present'), title = "San Francisco visits/person and change in cases/person, weighted with simple ratio")
model_sf_visits_cases_change_weighted <- lm(change_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted)
summary(model_sf_visits_cases_change_weighted)
##
## Call:
## lm(formula = change_cases_by_pop ~ weighted_visits_per_capita,
## data = sf_sd_visits_cases_zip_change_w_weighted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0011003 -0.0006846 -0.0004876 0.0006605 0.0018655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0002314 0.0006827 0.339 0.738
## weighted_visits_per_capita 0.0035985 0.0033202 1.084 0.290
##
## Residual standard error: 0.0009355 on 23 degrees of freedom
## Multiple R-squared: 0.04859, Adjusted R-squared: 0.007226
## F-statistic: 1.175 on 1 and 23 DF, p-value: 0.2897
sf_sd_visits_cases_zip_change_w_weighted %>%
plot_ly() %>%
add_trace(x = ~weighted_visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~weighted_visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total weighted (with simple ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in log(cases per person) from 4/21-present'), title = "San Francisco visits/person and change in cases/person, weighted with simple ratio")
model_sf_visits_cases_change_weighted_log <- lm(change_log_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted)
summary(model_sf_visits_cases_change_weighted_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ weighted_visits_per_capita,
## data = sf_sd_visits_cases_zip_change_w_weighted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54716 -0.27788 -0.06889 0.13175 1.98223
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2610 0.3501 0.746 0.463
## weighted_visits_per_capita 1.3633 1.7024 0.801 0.431
##
## Residual standard error: 0.4797 on 23 degrees of freedom
## Multiple R-squared: 0.02712, Adjusted R-squared: -0.01517
## F-statistic: 0.6413 on 1 and 23 DF, p-value: 0.4314
Again removing the weird zip code, 94108.
sf_sd_visits_cases_zip_change_w_weighted_edit <- sf_sd_visits_cases_zip_change_w_weighted %>% filter(zipcode != "94108")
sf_sd_visits_cases_zip_change_w_weighted_edit %>%
plot_ly() %>%
add_trace(x = ~weighted_visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~weighted_visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted_edit)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total weighted (with simple ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in log(cases per person) from 4/21-present'), title = "San Francisco visits/person and change in cases/person, weighted with simple ratio")
model_sf_visits_cases_change_weighted_log_2 <- lm(change_log_cases_by_pop ~ weighted_visits_per_capita, sf_sd_visits_cases_zip_change_w_weighted_edit)
summary(model_sf_visits_cases_change_weighted_log_2)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ weighted_visits_per_capita,
## data = sf_sd_visits_cases_zip_change_w_weighted_edit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49973 -0.17919 0.00321 0.18088 0.47388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.07202 0.17017 0.423 0.6763
## weighted_visits_per_capita 1.89901 0.82316 2.307 0.0309 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2313 on 22 degrees of freedom
## Multiple R-squared: 0.1948, Adjusted R-squared: 0.1582
## F-statistic: 5.322 on 1 and 22 DF, p-value: 0.03085
At least more significant.
Compare to using the square of the ratio:
sf_sd_visits_cases_zip_change_w_weighted_edit %>%
plot_ly() %>%
add_trace(x = ~weighted_visits_per_capita_ratio_sqr, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~weighted_visits_per_capita_ratio_sqr, y = fitted(lm(change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr, sf_sd_visits_cases_zip_change_w_weighted_edit)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total weighted (with square of ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in log(cases per person) from 4/21-present'), title = "San Francisco visits/person and change in cases/person, weighted with square of ratio")
model_sf_visits_cases_change_weighted_sqr_log <- lm(change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr, sf_sd_visits_cases_zip_change_w_weighted_edit)
summary(model_sf_visits_cases_change_weighted_sqr_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr,
## data = sf_sd_visits_cases_zip_change_w_weighted_edit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50400 -0.17484 0.00142 0.21739 0.43481
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2784 0.1318 2.112 0.0463 *
## weighted_visits_per_capita_ratio_sqr 17.5128 12.4946 1.402 0.1750
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.247 on 22 degrees of freedom
## Multiple R-squared: 0.08198, Adjusted R-squared: 0.04025
## F-statistic: 1.965 on 1 and 22 DF, p-value: 0.175
Less significant with the square of the ratio.
Try Alameda.
ac_sd_visits_cases_zip_change_w_weighted %>%
plot_ly() %>%
add_trace(x = ~weighted_visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~weighted_visits_per_capita, y = fitted(lm(change_cases_by_pop ~ weighted_visits_per_capita, ac_sd_visits_cases_zip_change_w_weighted)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total weighted (with simple ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in cases per person from 4/21-present'), title = "Alameda visits/person and change in cases/person, weighted with simple ratio")
model_ac_visits_cases_change_weighted <- lm(change_cases_by_pop ~ weighted_visits_per_capita, ac_sd_visits_cases_zip_change_w_weighted)
summary(model_ac_visits_cases_change_weighted)
##
## Call:
## lm(formula = change_cases_by_pop ~ weighted_visits_per_capita,
## data = ac_sd_visits_cases_zip_change_w_weighted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0019119 -0.0004788 -0.0001427 0.0001931 0.0039120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0017986 0.0007617 -2.361 0.022708 *
## weighted_visits_per_capita 0.0166457 0.0044260 3.761 0.000496 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001036 on 44 degrees of freedom
## Multiple R-squared: 0.2433, Adjusted R-squared: 0.2261
## F-statistic: 14.14 on 1 and 44 DF, p-value: 0.0004964
ac_sd_visits_cases_zip_change_w_weighted %>%
plot_ly() %>%
add_trace(x = ~weighted_visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~weighted_visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ weighted_visits_per_capita, ac_sd_visits_cases_zip_change_w_weighted)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total weighted (with simple ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in log(cases per person) from 4/21-present'), title = "Alameda visits/person and change in cases/person, weighted with simple ratio")
model_ac_visits_cases_change_weighted_log <- lm(change_log_cases_by_pop ~ weighted_visits_per_capita, ac_sd_visits_cases_zip_change_w_weighted)
summary(model_ac_visits_cases_change_weighted_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ weighted_visits_per_capita,
## data = ac_sd_visits_cases_zip_change_w_weighted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05605 -0.18465 0.05513 0.18785 0.83029
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2877 0.2787 -1.032 0.307550
## weighted_visits_per_capita 6.0282 1.6193 3.723 0.000557 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3791 on 44 degrees of freedom
## Multiple R-squared: 0.2395, Adjusted R-squared: 0.2222
## F-statistic: 13.86 on 1 and 44 DF, p-value: 0.0005571
Interesting–the weighting makes the Alameda results more significant than without the weighting, but that isn’t true in SF.
Try with square of ratio.
ac_sd_visits_cases_zip_change_w_weighted %>%
plot_ly() %>%
add_trace(x = ~weighted_visits_per_capita_ratio_sqr, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode) %>%
add_trace(x = ~weighted_visits_per_capita_ratio_sqr, y = fitted(lm(change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr, ac_sd_visits_cases_zip_change_w_weighted)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total weighted (with square of ratio) visit-hours per person, 4/16-5/24'), yaxis = list(title = 'Change in log(cases per person) from 4/21-present'), title = "Alameda visits/person and change in cases/person, weighted with square of ratio")
model_ac_visits_cases_change_weighted_sqr_log <- lm(change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr, ac_sd_visits_cases_zip_change_w_weighted)
summary(model_ac_visits_cases_change_weighted_sqr_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ weighted_visits_per_capita_ratio_sqr,
## data = ac_sd_visits_cases_zip_change_w_weighted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7261 -0.2718 -0.0700 0.2180 1.2130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8266 0.1147 7.209 5.65e-09 ***
## weighted_visits_per_capita_ratio_sqr -13.0063 12.6949 -1.025 0.311
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4296 on 44 degrees of freedom
## Multiple R-squared: 0.0233, Adjusted R-squared: 0.001102
## F-statistic: 1.05 on 1 and 44 DF, p-value: 0.3112
Again not significant with square of ratio.
Combine SF and Alameda with the ratio weighting.
ac_sf_sd_visits_cases_zip_change_w_weighted <- rbind(ac_sd_visits_cases_zip_change_w_weighted %>% dplyr::select(zipcode, change_log_cases_by_pop, weighted_visits_per_capita, change_cases_by_pop) %>% cbind(county = 'Alameda'), sf_sd_visits_cases_zip_change_w_weighted_edit %>% dplyr::select(zipcode, change_log_cases_by_pop, weighted_visits_per_capita, change_cases_by_pop) %>% cbind(county = 'San Francisco'))
ac_sf_sd_visits_cases_zip_change_w_weighted %>% plot_ly() %>%
add_trace(x = ~weighted_visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers', text = ~zipcode, color = ~county) %>%
add_trace(x = ~weighted_visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ weighted_visits_per_capita, ac_sf_sd_visits_cases_zip_change_w_weighted)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total weighted visit-hours per person, 4/16-5/24'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "Weighted visits/person and change in cases/person, SF and Alameda")
model_combined_visits_cases_change_weighted_log <- lm(change_log_cases_by_pop ~ weighted_visits_per_capita, ac_sf_sd_visits_cases_zip_change_w_weighted)
summary(model_combined_visits_cases_change_weighted_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ weighted_visits_per_capita,
## data = ac_sf_sd_visits_cases_zip_change_w_weighted)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.75954 -0.24441 -0.00452 0.22304 1.17466
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2120 0.1859 1.141 0.2580
## weighted_visits_per_capita 2.3523 1.0064 2.337 0.0224 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3871 on 68 degrees of freedom
## Multiple R-squared: 0.07437, Adjusted R-squared: 0.06076
## F-statistic: 5.464 on 1 and 68 DF, p-value: 0.02237
Interesting, far less significant than without weighting.